- /* WIDE AREA INFORMATION SERVER SOFTWARE
- No guarantees or restrictions. See the readme file for the full standard
- disclaimer.
- Brewster@think.com
- */
-
- /* Copyright (c) CNIDR (see ../COPYRIGHT) */
-
-
- /* Looks up words in the inverted file index.
- *
- * Important functions:
- * run_search
- * search_for_words
- *
- * to do:
- * Handle searches on multiple databases
- */
-
- /* Change Log:
- * $Log: irsearch.c,v $
- * Revision 1.5 1993/10/17 15:38:50 huynh1
- * new code added for soundex, phonix retrieval and
- * nested boolean queries.
- *
- * Revision 1.3 93/07/21 18:46:35 warnock
- * Added STELAR-specific patches
- *
- * Revision 1.2 93/07/02 18:04:26 warnock
- * replace handle_next_and_previous for multi-type from francois
- *
- * Revision 1.1 1993/02/16 15:05:35 freewais
- * Initial revision
- *
- * Revision 1.54 92/05/10 14:44:35 jonathan
- * Made a little safer on NULL docid's when parsing.
- *
- * Revision 1.53 92/05/04 17:20:11 jonathan
- * Added test for parsing docids (if null, log error).
- *
- * Revision 1.52 92/04/29 08:22:17 shen
- * declare global variable "_BE_normalized" to allow turning on/off FE score
- * normalization.
- *
- * Revision 1.51 92/04/28 16:56:30 morris
- * added boolean to serial engine
- *
- * Revision 1.50 92/04/01 17:10:21 jonathan
- * ?
- *
- * Revision 1.49 92/03/23 13:26:27 shen
- * add timing for query. Compile with GET_QUERY_TIMING. print timing every 200 queries.
- *
- * Revision 1.48 92/03/18 08:56:00 jonathan
- * Removed databaseName argument to getDocumentText and getData.
- *
- * Revision 1.47 92/02/17 16:22:42 jonathan
- * Added WCAT to types that can be used for relevance feedback.
- *
- * Revision 1.46 92/02/16 18:04:38 jonathan
- * Demoted more WLOG_ERROR's to WLOG_WARNING's
- *
- * Revision 1.45 92/02/16 09:51:12 jonathan
- * Plugged some memory leaks. I bet there are more.
- *
- * Revision 1.44 92/02/15 19:41:20 jonathan
- * Improved logging for invalid relevant documents.
- *
- * Revision 1.43 92/02/14 16:06:48 jonathan
- * Added diagnostic record for invalid relevant document.
- *
- * Revision 1.42 92/02/12 17:30:20 jonathan
- * Conditionalized inclusion of object code.
- *
- * Revision 1.41 92/02/12 17:04:03 jonathan
- * Moved logging info around.
- *
- * Revision 1.40 92/02/12 15:26:35 morris
- * only call finished_search_word when the preceding search was successful
- *
- * Revision 1.39 92/02/12 13:30:39 jonathan
- * Added "$Log" so RCS will put the log message in the header
- *
- * changes 5.2.90 HWM
- - changed calls to perror() to calls to panic()
- - made print_best_hits() only print hits w/ non-zero weight
- - made random arrays static instead of reading them in.
- removed getRandomArray.
- - removed unused variables
- Brewster 7/90 made look_up_word_in_dictionary safer.
- Brewster 7/90 eliminated trailing <lf> on filename and headline table accesses
- HWM 7.12.90 - replaced all calls to panic with error code returns and a log
- file
- - added the routine initSearchEngine() which should be called
- before any other search routine
- - added beFriendly() to give other processes time under
- multifinder
- JG 5.31.91 - added relevance feedback for line fragments.
- JG 7.8.91 - added doc_id to search_for_words, removed scale_scores.
- */
-
- #if 0
- #define GET_QUERY_TIMING
- #endif
-
- #define _search_c
-
- #ifdef __osf__
- #include <ctype.h>
- #endif /* __osf__ */
-
- #include <string.h> /* for strlen() */
- #ifdef THINK_C
- #include <unix.h> /* for sleep() */
- #endif /* THINK_C */
-
- #include "cutil.h"
- #include "irfiles.h"
- #include "irtfiles.h" /* for map_over_words */
- #include "irlex.h"
- #include "irext.h"
- #include "irsearch.h"
- #include "docid.h"
- #include <math.h>
- #include "irretrvl.h"
- #ifdef BOOL
- #include "irparse.h"
- #endif
- #ifdef SOUND
- #include "soundex.h"
- #endif
-
- /* tung , 10/93 */
- #ifdef BOOLEANS
- #include "manipulstr.h"
- #endif
- /* tung */
-
- #include "trie.h"
-
- #ifdef WIN32
- #include <ctype.h>
- void dispose_trie_allocator(trie_allocator*);
- long previous_docid(char*,database*);
- long next_docid(char*,database*);
- int encode(unsigned char*);
- void decode(unsigned char*);
- void panic(char*,...);
- #endif
-
- #define TEST_SEARCH false /* set to TRUE to allow printing to console */
-
- #ifdef STELAR
- /* File of associations between STELAR abstracts and bitmaps */
- FILE *BITMAPS;
- #endif /* STELAR */
-
- char *server_name = NULL;
- long tcp_port = 0;
-
- long _BE_normalized = 0;
-
- #ifdef GET_QUERY_TIMING
- #include <sys/timeb.h>
- static struct timeb s_time, e_time;
- static float t_time = 0;
- static long n_query = 0;
- #endif
-
-
- /*----------------------------------------------------------------------*/
-
- static Boolean calcDocLength _AP((hit* theHit,long* lines,long* bytes));
-
- static Boolean
- calcDocLength(theHit,lines,bytes)
- hit* theHit;
- long* lines;
- long* bytes;
- /* Given a hit, report how many lines it contains and figure out how
- many bytes, opening the file if necessary. This is not needed by the
- serial search engine (it stores these values in its dictionary); it is
- used by the dynamic help facility.
- */
- {
- *lines = theHit->number_of_lines;
-
- /* find the length of the document */
- if(theHit->end_character != 0)
- {
- /* document is not whole file, so size is stored */
- *bytes = theHit->end_character - theHit->start_character;
- return(true);
- }
- else
- {
- /* whole file, find file length from the file */
- FILE* file = NULL;
- if (((file = s_fopen(theHit->filename, "r")) != NULL) &&
- (s_fseek(file, 0L, SEEK_END) == 0) &&
- ((*bytes = ftell(file)) != -1))
- { s_fclose(file);
- return(true); /* we are done, bytes is set */
- }
- else
- { s_fclose(file);
- return(false); /* something went wrong with the file */
- }
- }
- }
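-
- /* Illustrative sketch (not part of the original source): calcDocLength()
-    is what best_hit_to_header() and handle_next_and_previous() below use
-    to size a hit before building a document header for it.  `the_hit' is
-    a stand-in for a hit filled in by next_best_hit(). */
- #if 0
- {
-   hit the_hit;
-   long lines, bytes;
-   if (calcDocLength(&the_hit, &lines, &bytes))
-     waislog(WLOG_LOW, WLOG_INFO, "%ld lines, %ld bytes.", lines, bytes);
- }
- #endif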
-
-
-
- static long search_word_before_pairs _AP((char *word, long char_pos,
- long line_pos, long weight,
- long doc_id, time_t doc_date,
- boolean capitalized, database* db));
-
- /* returns 0 if successful, non-0 if error. A copy of add_word_before_pairs */
- static long search_word_before_pairs (word, char_pos, line_pos,
- weight, doc_id, doc_date, capitalized, db)
- char *word; /* the word to be indexed, this could be a
- word pair. If NULL there are no more words
- to be indexed */
- long char_pos; /* the position of the start of the
- word */
- long line_pos; /* this is passed for the best
- section calculation */
- long weight; /* how important the word looks
- syntactically (such as is it bold)
- NOT used by signature system */
- long doc_id; /* current document, this will never be 0 */
- time_t doc_date; /* display day of this document, 0 if not known */
- boolean capitalized; /* if the word started with a cap */
- database* db; /* database to insert the document */
- {
- static char last_word[MAX_WORD_LENGTH + 1];
- static long last_doc_id = -1;
- /* The way it works: it remembers the last word if that word was
- capitalized (if not, it clears the saved word).
- If another capitalized word comes along next
- (and it is in the same document), then it makes a joint word out of the
- two and calls search_word with it. */
- if(capitalized){
- if(last_word[0] != '\0' && last_doc_id == doc_id){
- search_word(make_joint_word(last_word, word),
- char_pos, line_pos, weight, doc_id, 1L, db);
- }
- else{
- last_word[0] = '\0';
- }
- strncpy(last_word, word, MAX_WORD_LENGTH);
- last_doc_id = doc_id;
- }
- else{ /* not capitalized */
- last_word[0] = '\0';
- }
- return(search_word(word, char_pos,
- line_pos, weight, doc_id, 0L, db));
- }
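-
- /* Illustrative sketch (not part of the original source): how the word-pair
-    mechanism above behaves on "New York is large".  The positions, weights
-    and document id are made up; only the call pattern matters here. */
- #if 0
- {
-   database* db = NULL; /* stands for an open database */
-   /* "new": capitalized, remembered as last_word, searched on its own */
-   search_word_before_pairs("new", 0L, 0L, 1L, 1L, (time_t)0L, true, db);
-   /* "york": capitalized and in the same document, so search_word() is
-      first called with make_joint_word("new", "york"), then "york" itself
-      is searched */
-   search_word_before_pairs("york", 4L, 0L, 1L, 1L, (time_t)0L, true, db);
-   /* "is": not capitalized, clears the remembered word */
-   search_word_before_pairs("is", 9L, 0L, 1L, 1L, (time_t)0L, false, db);
- }
- #endif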
-
- long count_trie_words;
- long count_uniq;
-
- boolean prepare_word_list(words,set,alloc)
- char* words;
- trie* set;
- trie_allocator* alloc;
- {
- char* word = NULL;
- int * datum;
- count_trie_words = count_uniq = 0;
- /* printf("words: %s\n", words); */
- #ifdef BIO
- word = (char*)strtokf(words,wordDelimiter);
- #else
- word = (char*)strtokf_isalnum(words);
- #endif
- while(word != NULL){
- #ifndef WIN32
- long dictionary_value;
- #endif
- /* trim the string if necessary */
- if(strlen(word) > MAX_WORD_LENGTH){
- word[MAX_WORD_LENGTH] = '\0';
- }
-
- if(!encode((unsigned char*)word)) {
- panic("can't encode word %s",word);
- }
- datum = (int *)trie_lookup(word,set,alloc);
- if(!datum) {
- panic("trie_lookup failed !!!");
- }
-
- count_trie_words++;
-
- *datum += 1;
-
- if (*datum == 1 ) {
- count_uniq++;
- }
- #ifdef BIO
- word = (char *)strtokf(NULL,wordDelimiter);
- #else
- word = (char *)strtokf_isalnum(NULL);
- #endif
- beFriendly();
- }
-
- waislog(WLOG_LOW, WLOG_INFO,
- "after preparing word list, %ld search items were presented.",
- count_trie_words);
- waislog(WLOG_LOW, WLOG_INFO,
- "There are %ld words to search for.",
- count_uniq);
-
- return(true);
- }
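-
- /* Illustrative sketch (not from the original source): building the word
-    trie for a piece of relevant text, the way search_for_words_in_document()
-    does further down.  The text is a stand-in. */
- #if 0
- {
-   trie_allocator* alloc = make_trie_allocator();
-   trie* dict = new_trie("", alloc);
-   char text[] = "information retrieval information";
-   prepare_word_list(text, dict, alloc);
-   /* count_trie_words is now 3 and count_uniq is 2: each distinct word got
-      a trie node whose datum counts its occurrences */
-   dispose_trie_allocator(alloc);
- }
- #endif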
-
- boolean search_for_trie_words(dict,db,prefix,docid,result)
- trie* dict;
- database* db;
- char* prefix;
- long docid;
- boolean result;
- {
- char buffer[MAX_WORD_LENGTH+1];
- char tmp_word[MAX_WORD_LENGTH+1];
- char* word;
- #ifndef WIN32
- long dictionary_value;
- int weight;
- #endif
- if (dict == NULL) {
- return result;
- }
- if (*dict->string) {
- strcpy(buffer,prefix);
- strcat(buffer,dict->string);
- word = buffer;
- } else {
- word = prefix;
- }
-
- if (dict->datum) {
- #ifndef WIN32
- long number_of_occurrences;
- #endif
- /* this node has data */
- strcpy(tmp_word,word);
- decode(tmp_word);
- result |= search_word(tmp_word,0L,0L,1L,docid,0L,db);
- }
- if (dict->table) {
- int i;
- int len;
- len = strlen(word);
- for (i=0;i<ALPHA_SIZE;i++) {
- if(dict->table[i]) {
- word[len]=(char)i;
- word[len+1]='\0';
- result = search_for_trie_words(dict->table[i],db,word,docid,result);
- }
- }
- }
- return result;
- }
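-
- /* Illustrative sketch (not from the original source): walking a trie that
-    prepare_word_list() filled in, as search_for_words_in_document() does
-    below.  Every node carrying a datum yields one search_word() call and
-    the boolean results are OR-ed together. */
- #if 0
- {
-   char prefix[MAX_WORD_LENGTH + 1];
-   boolean found = false;
-   prefix[0] = '\0';
-   /* the_dict and db stand for the trie built above and an open database */
-   found = search_for_trie_words(the_dict, db, prefix, 1L, found);
- }
- #endif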
-
-
- /* dgg -- pulled this from irtfiles.c:map_over_words */
- /* returns the number of words searched, or -1 if an error occurred */
- long search_over_words
- _AP((char* line,long document_id,database* db,char* words_used));
-
- long search_over_words(line, document_id, db, words_used)
- char* line;
- long document_id;
- database* db;
- char* words_used;
- {
- long weight = 1L;
- long file_position_before_line = 0;
- boolean word_position= false;
- boolean word_pairs= false;
- #ifdef BIO
- int minwordlen= 1; /* only if symbols are active ? */
- #else
- int minwordlen= 2;
- #endif
-
- long position_in_word = 0;
- long word_count = 0;
- unsigned long ch;
- long char_count = 0;
- boolean capitalized = false; /* if the word starts with a cap */
- #ifdef LITERAL
- char word[MAX_PHRASE_LENGTH + 1];
- char key;
- #else
- char word[MAX_WORD_LENGTH + 1];
- #endif
- #ifdef BOOLEANS
- #define MAX_LINE_LENGTH 1000
- boolean nextIsNot = false;
- char notwords[MAX_LINE_LENGTH+1];
- boolean parentheses = false; /* 10/93, tung */
- #endif
- #ifdef SOUND
- boolean nextIsSoundex = false;
- boolean nextIsPhonix = false;
- #endif
-
- #ifdef BOOLEANS
- notwords[0]= '\0';
-
- /* tung, 10/93
- for boolean search with parentheses "(... and ... )".
- There may but need not be blanks (or tabs) around the parentheses.
- */
- if((strchr(line, '(' )) && (strchr(line, ')' ))) {
- parentheses = true;
- line = string_manipulation(line);
- minwordlen = 1; /* e.g. for query (a or information) */
- }
- /* tung, 10/93 */
- #endif
-
-
- for(ch = (unsigned char)line[char_count++];
- ch != '\0'; ch = (unsigned char)line[char_count++]){
- #ifdef BIO
- boolean alnum = (wordDelimiter(ch) == NOT_DELIMITER);
- #else
- boolean alnum = isalnum(ch);
- #endif
- #ifdef PARTIALWORD
- if (ch == PARTWORD_WILDCARD) { alnum= true; minwordlen= MAX(2,minwordlen); }
- #endif
- #ifdef LITERAL
- if (ch == LITERAL_KEY1) key= LITERAL_KEY1;
- else if (ch == LITERAL_KEY2) key= LITERAL_KEY2;
- else { key= 0; weight = 1L ;} /* because key is reset to 0 here, it is
- not possible to combine a literal
- search with a boolean search */
- if (key != 0) {
- char *cp, *match;
- cp = line + char_count;
- match = strchr( cp, key);
- /* printf("search_over_words: literal key is [%c]\n", key); */
- if (match != NULL && cp < match) {
- for (position_in_word=0; cp < match; cp++, char_count++)
- if (position_in_word < MAX_PHRASE_LENGTH) {
- word[position_in_word++] = char_downcase((unsigned long)*cp);
- }
- char_count++; /* skip closing key */
- alnum= false;
- capitalized= false;
- weight= LITERAL_FLAG; /* is this a safe flag value? It is unused
- here but passed on to search_word, which is what we need */
- /* !! need to break literal "word" into 1st dictionary word and search
- on that... */
- /* printf("search_over_words: literal is [%s]\n", word); */
- }
- }
- #endif
-
- if(alnum){
- /* put the character in the word if not too long */
- if(position_in_word == 0)
- capitalized = isupper((unsigned long)ch)?true:false;
- if(position_in_word < MAX_WORD_LENGTH){
- word[position_in_word++] = char_downcase((unsigned long)ch);
- }
- }
- else{ /* not in a word */
- if(position_in_word != 0){
- #ifdef BOOLEANS
- /* note on BOOLEANS -- we really need to check for NOT words here,
- and move them to back of line so that (wordfunction)== search_word is
- called for NOT words after other words (excluding NOT inside a literal)
- */
- if (nextIsNot) {
- word[position_in_word] = '\0';
- strcat( notwords, word);
- strcat( notwords, " ");
- nextIsNot= false;
- word_count++;
- }
- else if ((strncmp(word,BOOLEAN_NOT,position_in_word)==0)) {
- if(parentheses == false) { /* tung, 10/93 */
- nextIsNot= true;
- word_count++;
- }
- else { /* tung, 10/93 */
- word[position_in_word] = '\0';
- if(0 != search_word_before_pairs
- (word,
- file_position_before_line + char_count,
- /*^^ this param is supposed to be start-of-word, but char_count is now at end-of-word !*/
- 0L, /* line_pos */
- weight,
- document_id,
- (time_t)0L,
- capitalized,
- db))
- return(-1); /* error */
- nextIsNot= false;
- word_count++;
- }
- }
- else
- #endif
- #ifdef SOUND
- if (nextIsSoundex) {
- static char Key[80];
- word[position_in_word] = '\0';
- SoundexCode(word,Key);
- #ifdef DEBUG
- printf("search_over_words: adding Soundex (%s,%s)\n", word, Key);
- #endif
- if(0 != search_word_before_pairs
- (Key,
- file_position_before_line + char_count,
- 0L, /* line_pos */
- weight,
- document_id,
- (time_t)0L,
- capitalized,
- db))
- return(-1);
- word_count++; nextIsSoundex = false;
- } else if ((strncmp(word,SOUNDEX,strlen(SOUNDEX))==0)) {
- nextIsSoundex= true;
- word_count++;
- }
- else if (nextIsPhonix) {
- char Key[80];
- word[position_in_word] = '\0';
- PhonixCode(word,Key);
- #ifdef DEBUG
- printf("search_over_words: adding Phonix (%s,%s)\n", word, Key);
- #endif
- if(0 != search_word_before_pairs
- (Key,
- file_position_before_line + char_count,
- 0L, /* line_pos */
- weight,
- document_id,
- (time_t)0L,
- capitalized,
- db))
- return(-1);
- word_count++; nextIsPhonix = false;
- } else if ((strncmp(word,PHONIX,strlen(PHONIX))==0)) {
- nextIsPhonix= true;
- word_count++;
- } else
- #endif
- /* then we have collected a word */
- if(position_in_word >= minwordlen){ /* is it reasonable ? */
- long result;
- word[position_in_word] = '\0';
- if(0 > (result =
- search_word_before_pairs(word,
- file_position_before_line + char_count,
- /*^^ this param is supposed to be start-of-word, but char_count is now at end-of-word !*/
- 0L, /* line_pos */
- weight,
- document_id,
- (time_t)0L,
- capitalized,
- db)))
- return(-1); /* error */
- if(result == 1 && words_used != NULL) {
- strcat(words_used, word);
- strcat(words_used, " ");
- }
- #ifdef BOOLEANS
- nextIsNot= false;
- #endif
- #ifdef SOUND
- nextIsPhonix = false;
- nextIsSoundex = false;
- #endif
- word_count++;
- }
- position_in_word = 0;
- }
- }
- }
-
- /* finish last word */
-
- #ifdef BOOLEANS
- if (nextIsNot) {
- word[position_in_word] = '\0';
- strcat( notwords, word);
- strcat( notwords, " ");
- nextIsNot= false;
- word_count++;
- }
- else
- #endif
- #ifdef SOUND
- if (nextIsSoundex) {
- static char Key[80];
- long result;
- word[position_in_word] = '\0';
- SoundexCode(word,Key);
- #ifdef DEBUG
- printf("search_over_words: adding Soundex (%s,%s)\n", word, Key);
- #endif
- if(0 > (result =
- search_word_before_pairs(Key,
- file_position_before_line + char_count,
- 0L, /* line_pos */
- weight,
- document_id,
- (time_t)0L,
- capitalized,
- db)))
- return(-1);
- word_count++;
- }
- else if (nextIsPhonix) {
- char Key[80];
- long result;
- word[position_in_word] = '\0';
- PhonixCode(word,Key);
- #ifdef DEBUG
- printf("search_over_words: adding Phonix (%s,%s)\n", word, Key);
- #endif
- if(0 > (result =
- search_word_before_pairs(Key,
- file_position_before_line + char_count,
- 0L, /* line_pos */
- weight,
- document_id,
- (time_t)0L,
- capitalized,
- db)))
- return(-1);
- if (result == 1 && words_used != NULL) {
- strcat(words_used, word);
- strcat(words_used, " ");
- }
- word_count++;
- }
- else
- #endif
- if(position_in_word >= minwordlen){ /* is it reasonable ? */
- word[position_in_word] = '\0';
- if(0 > search_word_before_pairs(word,
- file_position_before_line + char_count,
- 0L, /* line_pos */
- weight,
- document_id,
- (time_t)0L,
- capitalized,
- db))
- return(-1);
- word_count++;
- }
-
- #ifdef BOOLEANS
- if ((notwords[0] != '\0')) {
- char wordn[MAX_WORD_LENGTH+1]; /* this is a copy of wordp */
- char *wordp;
- capitalized= false;
- char_count= 0; /* !?? need char index for each word ? */
- weight= BOOLEAN_NOT_FLAG; /* is this a safe parameter ? */
- wordp= strtok( notwords, " ");
- while (wordp!=NULL) {
- s_strncpy(wordn,wordp,MAX_WORD_LENGTH);
- if(0 >
- search_word_before_pairs(wordn,
- file_position_before_line + char_count,
- 0L, /* line_pos */
- weight,
- document_id,
- (time_t)0L,
- capitalized,
- db))
- return(-1);
- wordp= strtok(NULL, " ");
- }
- }
- #endif
- return(word_count);
- }
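-
- /* Illustrative sketch (not from the original source): tokenizing a query
-    line.  The return value counts the words handed to
-    search_word_before_pairs(); -1 signals an error.  The buffer size is an
-    assumption for the example and db stands for an open database. */
- #if 0
- {
-   char query[] = "inverted file index";
-   char used[1000];
-   long n;
-   used[0] = '\0';
-   n = search_over_words(query, 0L, db, used);
-   /* on success n is 3; `used' lists, blank separated, the words whose
-      search succeeded */
- }
- #endif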
-
-
- boolean search_for_words(words, db, doc_id, words_used)
- char* words;
- /* break the string into words (using map_over_words)
- and repeatedly call search_word_before_pairs().
- Returns true if successful.
- */
- database *db;
- long doc_id; /* = 1 for words == document in relevance feedback search */
- char *words_used;
- {
-
- #ifdef BOOL
- /* LISP QUERY */
- if( words[0] == '(' ){ /* then it is a lisp query */
- /* this is a temporary stub for the real work */
- char error_string[ERROR_STRING_LEN];
- object* query = (object*)parseQuery(words,QUERY_SYNTAX_LISP,error_string);
- if(query == NULL){
- waislog(WLOG_HIGH, WLOG_ERROR, "Unparsable query %s", error_string);
- return(false);
- }
- else{
- query = (object*)send(query,Evaluate,db);
- return(true);
- }
- }
- #endif /* def BOOL */
-
- /* dgg: we really need a new version of map_over_words for searching only
- (not for adding, i.e. indexing); that way the main search routines can
- stay here (irsearch.c) and search_word can stay in sersrch.c
- */
-
- /* NORMAL QUERY */
- if( -1 == search_over_words(words, doc_id, db, words_used))
- return(false);
- else
- return(true);
- }
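-
- /* Illustrative sketch (not from the original source): the plain-query path
-    through search_for_words(), as run_search() uses it below.  doc_id 0
-    marks an ordinary seed-word query rather than relevance-feedback text;
-    the buffer size is an assumption and db stands for an open database. */
- #if 0
- {
-   char seed[] = "wide area information server";
-   char used[1000];
-   used[0] = '\0';
-   if (!search_for_words(seed, db, 0, used))
-     waislog(WLOG_HIGH, WLOG_ERROR, "search failed");
- }
- #endif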
-
-
- /* gets the next best hit from the search engine and fills in all the slots.
- If the document does not exist, then it gets another, etc.
- It returns 0 if successful */
- long next_best_hit(the_best_hit, db)
- hit *the_best_hit;
- database *db;
- {
- document_table_entry doc_entry;
- long ret_value,start,end,length,date,nlines=0;
- char headline[100];
- char filename[200],type[20];
- while(1){ /* keep going until we get a good document */
- if(0 != (ret_value = best_hit(db,&(the_best_hit->document_id),
- &(the_best_hit->best_character),
- &(the_best_hit->best_line),
- &(the_best_hit->weight),
- &start,&end,&date,&length,&nlines,
- headline,filename,type))){
- /* if we add start,end,length,date,headline here, we can remove lots of
- disk reads */
-
- return(ret_value);
- }
- if(the_best_hit->weight <= 0.0) /* if we are out of good stuff, return */
- return(1);
- /* fill in the rest of the hit [uwo] do this with a read...
- the_best_hit->start_character = start;
- the_best_hit->end_character = end;
- the_best_hit->document_length = length;
- the_best_hit->number_of_lines = nlines; [uwo] handle these below */
- strcpy(the_best_hit->filename,filename);
- strcpy(the_best_hit->type,type);
- strcpy(the_best_hit->headline,headline);
- /* sprintf(the_best_hit->date, "%d", date); [uwo] handled below */
- /* do we need this step?? JMF -- It would seem so. Peter Marshall [uwo]. */
- if (read_document_table_entry(&doc_entry, the_best_hit->document_id, db)){
- the_best_hit->start_character = doc_entry.start_character;
- the_best_hit->end_character = doc_entry.end_character;
- the_best_hit->document_length = doc_entry.document_length;
- the_best_hit->number_of_lines = doc_entry.number_of_lines;
- sprintf(the_best_hit->date, "%d", doc_entry.date);
- /* This doesn't seem to be necessary, though -- Peter Marshall [uwo]
- read_filename_table_entry(doc_entry.filename_id,
- the_best_hit->filename,
- the_best_hit->type,
- NULL,
- db),
- strncpy(the_best_hit->headline,
- read_headline_table_entry(doc_entry.headline_id,db),
- MAX_HEADLINE_LEN);
- [uwo] */
- #ifdef WIN32
- /* We don't check for the existence of the file, because it might have
- * a relative path name and we might not be in the right directory for
- * that relative path name to work. Someone higher up can worry about
- * that. */
- return 0;
- #else
- if(probe_file_possibly_compressed(the_best_hit->filename))
- return(0);
- else {
- waislog(WLOG_HIGH, WLOG_WARNING,
- "Dangling File %s in database %s.",
- the_best_hit->filename,
- db->database_file);
- }
- #endif
- }
- else {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "Error reading doc_table_entry for database %s, docid: %ld",
- db->database_file,
- the_best_hit->document_id);
- }
- beFriendly();
- }
- }
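-
- /* Illustrative sketch (not from the original source): draining the ranked
-    hits after a successful search, the same way run_search() does below.
-    The cap of 40 documents is arbitrary for the example and db stands for
-    an open database. */
- #if 0
- {
-   hit best_hit;
-   long i;
-   finished_search_word(db);
-   init_best_hit(db);
-   for (i = 0; i < 40; i++) {
-     if (0 != next_best_hit(&best_hit, db))
-       break; /* out of hits */
-     /* best_hit.headline, best_hit.filename, best_hit.weight etc. are now
-        filled in for the i-th best document */
-   }
-   finished_best_hit(db);
- }
- #endif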
-
- /*-New one---------------------------------------------------------------------*/
-
- /* this function figures out if the request is for a NEXT or Previous document.
- If it is, then it makes a header for it and returns it. If not, then it
- returns NULL. */
-
- WAISDocumentHeader*
- handle_next_and_previous(docs, db, waisProtocolVersion, server)
- DocObj** docs;
- database* db;
- long waisProtocolVersion;
- char* server;
- {
- char* dbName = db->database_file;
- WAISDocumentHeader* header;
- DocID* theDocID = NULL;
- char *local_id;
-
- long count,i;
- char* tmp_type = NULL; /* temporary type */
- char* tmp_type_pointer = NULL; /* temporary type pointer */
-
-
- if(docs != NULL) { /* All of this is for WAIS_Prev and WAIS_next */
- if(docs[0] != NULL && docs[0]->Type != NULL) {
- long id = -1;
-
- if((theDocID = docIDFromAny(docs[0]->DocumentID)) == NULL) {
- waislog(WLOG_HIGH, WLOG_WARNING, "can't parse docid");
- return(NULL);
- }
-
- local_id = anyToString(GetLocalID(theDocID));
-
- if(strcmp(docs[0]->Type,"WAIS_NEXT") == 0) /* next page */
- id = next_docid(local_id,db);
- else if(strcmp(docs[0]->Type,"WAIS_PREV") == 0) /* prev page */
- id = previous_docid(local_id, db);
-
- freeDocID(theDocID); s_free(local_id);
-
- if (id > -1) {
- document_table_entry doc_entry;
- hit foo;
- long lines,length;
- char local_id[MAX_FILENAME_LEN + 60]; /* filename, start, end */
-
- local_id[0] = '\0';
-
- if (read_document_table_entry(&doc_entry, id, db) == true) {
- foo.start_character = doc_entry.start_character;
- foo.end_character = doc_entry.end_character;
- foo.document_length = doc_entry.document_length;
- foo.number_of_lines = doc_entry.number_of_lines;
-
- read_filename_table_entry(doc_entry.filename_id,
- foo.filename,
- foo.type,
- NULL,
- db),
- strncpy(foo.headline,
- read_headline_table_entry(doc_entry.headline_id,db),
- MAX_HEADLINE_LEN);
- sprintf(foo.date, "%d", doc_entry.date);
- sprintf(local_id, "%ld %ld %s",
- doc_entry.start_character,
- doc_entry.end_character,
- foo.filename);
-
- if(calcDocLength(&(foo),&lines,&length)){
- /* this document is good, return it */
- char** type = NULL;
-
- /* multitype extensions */
- /*
- Need to parse out the document types and add them to the
- document type list.
- */
- if (waisProtocolVersion >= '2') {
-
- /* I left this conditional here (it is not really needed, the
- 'else' part could take care of both) on the assumption that
- it would be faster for single type documents.
- */
- if ( strstr(foo.type,",") == NULL ) {
- type = (char**)s_malloc((size_t)(sizeof(char*) * 2));
- type[0] = s_strdup(foo.type);
- type[1] = NULL;
- }
- else {
- /* count up the number of document types */
- count = 1L;
- #ifdef WIN32
- for (i = 0L; i < (long)strlen(foo.type); i++){
- #else
- for (i = 0L; i < strlen(foo.type); i++){
- #endif
- if ( foo.type[i] == ',' )
- count++;
- }
-
-
- /* allocate space for types */
- type = (char**)s_malloc((size_t)(sizeof(char*) * (count + 1L)));
-
- /* duplicate the type and save the pointer */
- tmp_type = s_strdup(foo.type);
- tmp_type_pointer = tmp_type;
-
-
- /* add types - NULL out the pointer so that strtok can grab the subsequent entries */
- for (i = 0L; i < count; i++ ) {
- type[i] = s_strdup(strtok(tmp_type_pointer,","));
- tmp_type_pointer = NULL;
- }
-
- /* add the terminating null */
- type[count] = NULL;
-
- /* release the tmp_type allocations */
- s_free(tmp_type);
-
- }
-
- }
-
- theDocID = makeDocID();
-
- theDocID->distributorServer = stringToAny(server);
- theDocID->originalServer = stringToAny(server);
- theDocID->distributorDatabase = stringToAny(dbName);
- theDocID->originalDatabase = stringToAny(dbName);
- theDocID->distributorLocalID = stringToAny(local_id);
- theDocID->originalLocalID = stringToAny(local_id);
-
- header=
- makeWAISDocumentHeader(anyFromDocID(theDocID),
- UNUSED,
- -1L,
- UNUSED,
- length,lines,
- type,
- s_strdup(dbName),
- s_strdup(foo.date),
- s_strdup(foo.headline),
- NULL);
- freeDocID(theDocID);
- return(header);
- }
- else{
- waislog(WLOG_HIGH, WLOG_WARNING,
- "document <%ld %ld %s> skipped.",
- doc_entry.start_character,
- doc_entry.end_character,
- foo.filename);
- return(NULL);
- }
- }
- }
- }
- }
- return(NULL);
- }
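-
- /* Note (an illustration, not from the original source): the function above
-    is driven by the Type field of the first DocObj in the request.  A client
-    that wants the document after a known one sends a DocObj whose Type is
-    "WAIS_NEXT" and whose DocumentID names the current document; "WAIS_PREV"
-    works the same way in the other direction.  Anything else makes the
-    function return NULL, and run_search() then treats the request as an
-    ordinary query (using any documents for relevance feedback instead). */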
-
-
- /*----------------------------------------------------------------------*/
- /* search for each of the words in a document, up to a limit.
- this is for relevance feedback. */
-
- #define MAX_TEXT_SIZE 100000 /* Maximum size of relevant text */
-
- /* returns true if it added the words, false otherwise (not necessarily
- an error) */
- boolean search_for_words_in_document(doc, docid, db, diags, num_diags)
- DocObj* doc;
- long docid;
- database* db;
- diagnosticRecord*** diags; /* list of diagnostics */
- long *num_diags;
- {
- char * dbName = db->database_file;
- long errorCode;
- WAISDocumentText* doctext;
-
- char prefix[MAX_WORD_LENGTH+1];
- trie *the_dict;
- trie_allocator* alloc;
- count_trie_words =0;
- count_uniq=0;
-
- alloc=make_trie_allocator();
- the_dict = new_trie("",alloc);
- *prefix = 0;
-
- if(doc->Type == NULL ||
- substrcmp(doc->Type,"TEXT") ||
- strcmp(doc->Type,"WSRC") == 0 ||
- strcmp(doc->Type,"WCAT") == 0 ||
- doc->Type[0] == 0) {
-
- doctext = NULL;
- if (doc->ChunkCode == CT_line)
- doctext = getDocumentText(doc, &errorCode, NULL);
- else if ((doc->ChunkCode == CT_byte) ||
- (doc->ChunkCode == CT_document))
- doctext = getData(doc, &errorCode, NULL);
- if (doctext != NULL) {
-
- boolean search_result;
-
- if(doctext->DocumentText->size > MAX_TEXT_SIZE)
- doctext->DocumentText->bytes[MAX_TEXT_SIZE] = 0;
- search_result = prepare_word_list(doctext->DocumentText->bytes,the_dict,alloc);
- search_result |= search_for_trie_words(the_dict,db,prefix,docid,search_result);
- dispose_trie_allocator(alloc);
-
- freeWAISDocumentText(doctext);
- return(search_result);
- }
- else { /* bad docid? */
- DocID* theDocID = NULL;
- char* local_id = NULL;
- diagnosticRecord* diag = NULL;
- char msg[MAX_FILENAME_LEN * 2];
-
- theDocID = docIDFromAny(doc->DocumentID);
-
- if(theDocID == NULL) {
- local_id = s_strdup("can't parse docid");
- }
- else {
- local_id = anyToString(GetLocalID(theDocID));
-
- freeDocID(theDocID);
- }
- waislog(WLOG_HIGH, WLOG_WARNING,
- "Relevance Feedback with invalid doc-id: '%s'",
- local_id);
- strncpy(msg,"Relevant Document not available: ",
- MAX_FILENAME_LEN);
- s_strncat(msg,local_id,MAX_FILENAME_LEN,MAX_FILENAME_LEN);
- s_free(local_id);
- (*num_diags)++;
- diag = makeDiag(true,D_TemporarySystemError,msg);
- *diags = (diagnosticRecord**)s_realloc(*diags,(size_t)(sizeof(diagnosticRecord*) * *num_diags));
- (*diags)[(*num_diags)-1] = diag;
- }
-
- }
- return(false);
- }
-
-
- /*----------------------------------------------------------------------*/
- #ifdef FIXDATE
- FixDate(date)
- char *date;
- {
- long ndate;
- int year, month, day;
- ndate = atoi(date);
- year = ndate/10000;
- month = (ndate-10000*year)/100;
- day = (ndate-10000*year-100*month);
- #ifdef DEBUG
- fprintf(stderr, "FixDate: %ld (%d/%d/%d)\n", ndate, year, month, day);
- fprintf(stderr, "FixDate: %s\n", date);
- #endif
- if (year>100) strcpy(date,"0");
- else if (month>12) strcpy(date,"0");
- else if (day>31) strcpy(date,"0");
- }
- #endif
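-
- /* Worked example (an illustration, not from the original source): FixDate()
-    expects a numeric YYMMDD date string.  "920504" parses to year 92,
-    month 5, day 4 and is left alone; anything yielding a year above 100,
-    a month above 12 or a day above 31 -- e.g. an eight-digit "19920504" --
-    is replaced with "0". */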
-
- #ifdef ONELINELENGTH
- FixLength(length)
- int *length;
- {
- if (*length<MAX_HEADLINE_LEN)
- *length=0;
- }
- #endif
-
- WAISDocumentHeader*
- best_hit_to_header(best_hit, maxRawScore, waisProtocolVersion, server, db)
- hit* best_hit;
- long maxRawScore;
- long waisProtocolVersion;
- char *server;
- database* db;
- {
- long lines,length,count,i;
- DocID* theDocID = NULL;
- WAISDocumentHeader* header;
- char* originName = db->database_file;
- char local_id[MAX_FILENAME_LEN + 60]; /* filename, start, end */
- char* tmp_type = NULL; /* temporary type */
- char* tmp_type_pointer = NULL; /* temporary type pointer */
- local_id[0] = '\0';
-
- if (true == calcDocLength(best_hit,&lines,&length))
- { /* this document is good, return it */
- char** type = NULL;
- long normalScore;
- if ( _BE_normalized )
- #ifdef WIN32
- normalScore = (long)(best_hit->weight);
- #else
- normalScore = best_hit->weight;
- #endif
- else {
- normalScore = (long)floor(
- (((double)best_hit->weight) /
- ((double)maxRawScore)) *
- (MAX_NORMAL_SCORE + 1));
-
- if (normalScore > MAX_NORMAL_SCORE)
- normalScore = MAX_NORMAL_SCORE;
- }
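- /* Worked example (an illustration, not from the original source): when
-    _BE_normalized is 0 the raw weight is scaled here.  A hit with raw
-    weight 750 against a maxRawScore of 1000 becomes
-    floor((750.0 / 1000.0) * (MAX_NORMAL_SCORE + 1)); if MAX_NORMAL_SCORE
-    is 1000 that is 750, and any value above MAX_NORMAL_SCORE is clipped
-    back down to it. */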
-
- sprintf(local_id, "%ld %ld %s",
- best_hit->start_character,
- best_hit->end_character,
- best_hit->filename);
-
- /* multitype extensions */
- /*
- Need to parse out the document types and add them to the
- document type list.
- */
- if (waisProtocolVersion >= '2') {
-
- /* I left this conditional here (it is not really needed, the
- * 'else' part could take care of both) on the assumption that
- * it would be faster for single type documents.
- */
- if ( strstr(best_hit->type,",") == NULL ) {
- type = (char**)s_malloc((size_t)(sizeof(char*) * 2));
- type[0] = s_strdup(best_hit->type);
- type[1] = NULL;
- } else {
- /* Check for multiple types, including STELAR */
- #ifdef STELAR
- /*
- * Use originName to decide whether to look for bitmaps (only database
- * 'stelar' makes associations between abstracts and bitmaps).
- */
- char *dbname;
- if ((dbname = strrchr(originName, '/')) != NULL)
- dbname++;
- else
- dbname = originName;
- if ((!strcasecmp(dbname, "STELAR")
- || !strcasecmp(dbname, "STELAR-DB")) && BITMAPS) {
- char *absname, *bitname;
- char tmpname[80], inrec[80];
- int found = 0;
- if ((absname = strrchr(best_hit->filename, '/')) != NULL)
- strcpy(tmpname, ++absname);
- else
- strcpy(tmpname, best_hit->filename);
- absname = strtok(tmpname, ".");
- rewind(BITMAPS);
- do {
- fgets(inrec, 80, BITMAPS);
- bitname = strtok(inrec, "\n");
- if (!strcasecmp(absname, bitname)) {
- found = 1;
- break;
- }
- } while(!feof(BITMAPS));
-
- if (found) {
- waislog(WLOG_LOW, WLOG_RESULTS,
- "Bitmap association: %s", absname);
- type = (char**)s_malloc((size_t)(sizeof(char*) * 4));
- type[0] = s_strdup(best_hit->type);
- type[1] = s_strdup("TIFF");
- type[2] = s_strdup("ARTICLE_TIFF");
- type[3] = NULL;
- length = 1024L*1024L;
- } else {
- type = (char**)s_malloc((size_t)(sizeof(char*) * 2));
- type[0] = s_strdup(best_hit->type);
- type[1] = NULL;
- }
- } else { /* It's not the STELAR database */
- #endif /* STELAR */
- /* count up the number of document types */
- count = 1L;
- #ifdef WIN32
- for (i = 0L; i < (long)strlen(best_hit->type); i++){
- #else
- for (i = 0L; i < strlen(best_hit->type); i++){
- #endif
- if ( best_hit->type[i] == ',' )
- count++;
- }
-
- /* allocate space for types */
- type = (char**)s_malloc((size_t)(sizeof(char*) * (count + 1L)));
-
- /* duplicate the type and save the pointer */
- tmp_type = s_strdup(best_hit->type);
- tmp_type_pointer = tmp_type;
-
- /* add types - NULL out the pointer so that strtok can grab the subsequent entries */
- for (i = 0L; i < count; i++ ) {
- type[i] = s_strdup(strtok(tmp_type_pointer,","));
- tmp_type_pointer = NULL;
- }
-
- /* add the terminating null */
- type[count] = NULL;
-
- /* release the tmp_type allocations */
- s_free(tmp_type);
- #ifdef STELAR
- }
- #endif /* STELAR */
- }
- }
- else
- type = NULL;
- /*
- printf("header %ld out of %ld\n", *headerNum,
- wais_search->MaxDocumentsRetrieved);
- */
- theDocID = makeDocID();
-
- theDocID->distributorServer = stringToAny(server);
- theDocID->originalServer = stringToAny(server);
-
- theDocID->distributorDatabase = stringToAny(originName);
- theDocID->originalDatabase = stringToAny(originName);
-
- theDocID->distributorLocalID = stringToAny(local_id);
- theDocID->originalLocalID = stringToAny(local_id);
- #ifdef FIXDATE
- FixDate(best_hit->date);
- #endif
- #ifdef ONELINELENGTH
- FixLength(&length);
- #endif
- header =
- makeWAISDocumentHeader(anyFromDocID(theDocID),
- UNUSED,
- (long)normalScore,
- best_hit->best_line,
- length,lines,
- type,
- s_strdup(originName),
- s_strdup(best_hit->date),
- s_strdup(best_hit->headline),
- NULL);
- freeDocID(theDocID);
- return(header);
- }
- else
- {
- waislog(WLOG_HIGH, WLOG_WARNING,
- "document <%ld %ld %s> skipped.",
- best_hit->start_character,
- best_hit->end_character,
- best_hit->filename);
- return(NULL);
- }
- }
-
-
-
- /*----------------------------------------------------------------------*/
-
- boolean run_search(aSearch, headers, diags, index_directory,
- seed_words_used, waisProtocolVersion, headerNum)
- SearchAPDU* aSearch;
- WAISDocumentHeader** headers; /* list of results */
- diagnosticRecord*** diags; /* list of diagnostics */
- char *index_directory;
- char **seed_words_used; /* called with enough space */
- long waisProtocolVersion;
- long *headerNum;
- /* runs a search on the inverted file index and returns false if it fails
- in such a way that it cannot even make a diagnostic record
- (this should not happen).
- It fills in headers with the replies or makes a diagnostic record.
- */
- {
- diagnosticRecord* diag = NULL;
- WAISSearch* wais_search = (WAISSearch*)aSearch->Query; /* for convenience */
- database* db = NULL;
- long maxRawScore;
- long i;
- query_parameter_type parameters;
- boolean search_result;
- char server[255];
- WAISDocumentHeader* header;
- long num_diags = 0;
- char dbName[MAX_FILENAME_LEN * 2];
-
- if (aSearch->DatabaseNames == NULL)
- strcpy(dbName,merge_pathnames(INFO_DATABASE_NAME, index_directory));
- else
- strcpy(dbName,merge_pathnames(aSearch->DatabaseNames[0], index_directory));
-
- #ifdef GET_QUERY_TIMING
- ftime(&s_time);
- #endif
-
- /* strip .src if it is on the name */
- if(strlen(dbName) > strlen(".src"))
- if(0 == strcmp(dbName + strlen(dbName) - strlen(".src"),
- ".src"))
- dbName[strlen(dbName) - strlen(".src")] = '\0';
-
- if(server_name != NULL)
- sprintf(server, "%s:%ld", server_name, tcp_port);
- else
- sprintf(server, "localhost:0");
-
- db = openDatabase(dbName, false, true);
- if (db == NULL){
- char msg[MAX_FILENAME_LEN * 2];
- strncpy(msg,"The following database is not available: ",
- MAX_FILENAME_LEN);
- s_strncat(msg,dbName,MAX_FILENAME_LEN,MAX_FILENAME_LEN);
- diag = makeDiag(false,D_PermanentSystemError,msg);
- *diags = (diagnosticRecord **)s_realloc(*diags,(size_t)(sizeof(diagnosticRecord*) * 2));
- (*diags)[0] = diag;
- (*diags)[1] = NULL;
- return(false);
- }
-
- #ifdef BIO /* dgg */
- {
- char *cp= read_delimiters( db); /* use data-specific delimiters, if available */
- if (cp != NULL) {
- strcpy( gDelimiters, cp);
- wordDelimiter= wordbreak_user;
- }
- else
- wordDelimiter= wordbreak_notalnum;
- }
- #else
- wordDelimiter= wordbreak_notalnum; /* actually, wordDelimiter is used only ifdef BIO ? */
- #endif
-
- /* figure out if it is a NEXT or PREVIOUS, if so, return it. */
- header = handle_next_and_previous(wais_search->Docs, db,
- waisProtocolVersion, server);
- if(header != NULL){
- headers[(*headerNum)++] = header;
- headers[*headerNum] = NULL;
- return(true);
- }
-
- /* until seed_words_used is supported */
- /* strcpy(*seed_words_used, wais_search->SeedWords); */
-
- if (seed_words_used != NULL) *seed_words_used[0] = 0;
-
- parameters.max_hit_retrieved = wais_search->MaxDocumentsRetrieved;
- set_query_parameter(SET_MAX_RETRIEVED_MASK, &parameters);
-
- search_result = false;
- init_search_word(db);
-
- #ifdef RELEVANCE_FEEDBACK
- if(wais_search->Docs != NULL) {
- DocObj* doc = NULL;
- boolean res;
- /* assemble the elements and construct a response */
- for (i = 0, doc = wais_search->Docs[i];
- doc != NULL;
- doc = wais_search->Docs[++i]){
- search_result |=
- search_for_words_in_document(doc,i+1,db,diags,&num_diags);
- }
- if (*diags != NULL) {
- num_diags++;
- *diags = (diagnosticRecord**)s_realloc(*diags,(size_t)(sizeof(diagnosticRecord*) * num_diags));
- (*diags)[num_diags-1] = NULL;
- }
- }
- #endif /* RELEVANCE_FEEDBACK */
-
- search_result |= search_for_words(wais_search->SeedWords, db, 0, *seed_words_used);
-
- if (search_result == true){ /* the search went ok */
- hit best_hit;
- finished_search_word(db);
- init_best_hit(db);
- for (i = 0; i < wais_search->MaxDocumentsRetrieved; i++){
- if(0 != next_best_hit(&best_hit, db))
- break; /* out of hits */
- if(i == 0)
- #ifdef WIN32
- maxRawScore = (long)(best_hit.weight);
- #else
- maxRawScore = best_hit.weight;
- #endif
- if (best_hit.weight > 0){
- WAISDocumentHeader* header =
- best_hit_to_header(&best_hit, maxRawScore,
- waisProtocolVersion,server,db);
- if(NULL != header){
- headers[(*headerNum)++] = header;
- headers[*headerNum] = NULL;
- }
- }
- }
- }
- else
- { /* something went awry in the search */
- /* make room for the new diagnostic plus a NULL terminator: two slots
- when the list is still empty, otherwise reuse the existing
- terminator slot */
- num_diags = (num_diags == 0) ? 2 : num_diags + 1;
- diag = makeDiag(true,D_PermanentSystemError,
- "Serious error in server");
- *diags = (diagnosticRecord**)
- s_realloc(*diags, (size_t)(sizeof(diagnosticRecord*) * num_diags));
- (*diags)[num_diags-2] = diag;
- (*diags)[num_diags-1] = NULL;
- }
- finished_best_hit(db);
- /* free everything */
- closeDatabase(db);
- #ifdef GET_QUERY_TIMING
- ftime(&e_time);
- t_time += (e_time.time + e_time.millitm/1000.0) -
- (s_time.time + s_time.millitm/1000.0);
- n_query++;
- if ( n_query == 200 ) {
- waislog(WLOG_LOW, WLOG_INFO, "searching 200 queries takes %f seconds.",
- t_time);
- waislog(WLOG_LOW, WLOG_INFO, "average %f/query.", t_time/200.0);
- n_query = 0;
- t_time = 0;
- }
- #endif
-
- return(true);
- }
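-
-
- /* Illustrative sketch (not from the original source): roughly how a caller
-    might drive run_search().  The array sizes, the SearchAPDU and the index
-    directory come from the protocol layer in the real server; they are
-    stand-ins here. */
- #if 0
- {
-   WAISDocumentHeader* headers[41]; /* room for MaxDocumentsRetrieved + NULL */
-   diagnosticRecord** diags = NULL;
-   char seed_buffer[1000];
-   char* seed_words = seed_buffer;
-   long header_count = 0;
-   headers[0] = NULL;
-   seed_buffer[0] = '\0';
-   if (run_search(aSearch, headers, &diags, index_directory,
-                  &seed_words, (long)'2', &header_count))
-     waislog(WLOG_LOW, WLOG_INFO, "%ld document headers returned.",
-             header_count);
- }
- #endif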
-